# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("/storage"))
# Any results you write to the current directory are saved as output.
import os, sys
import re
from multiprocessing import Pool
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import skimage.io
from skimage.transform import resize
from scipy import ndimage
from imgaug import augmenters as iaa
from tqdm import tqdm
import PIL
from PIL import Image, ImageOps
import cv2
from sklearn.utils import class_weight, shuffle
from sklearn.metrics import f1_score, fbeta_score
from sklearn.model_selection import train_test_split
import seaborn as sns
from IPython.display import Image as ShowImage
from collections import Counter
import imageio
WORKERS = 2
CHANNEL = 3
import warnings
warnings.filterwarnings("ignore")
IMG_SIZE = 512
NUM_CLASSES = 5
SEED = 77
TRAIN_NUM = 1000 # use 1000 when you just want to explore new idea, use -1 for full train
df_train = pd.read_csv('/storage/aptos2019-blindness-detection/train.csv')
df_test = pd.read_csv('/storage/aptos2019-blindness-detection/test.csv')
x = df_train['id_code']
y = df_train['diagnosis']
x, y = shuffle(x, y, random_state=SEED)
df_test[:10]
def resolution_dist(df, img_sub_dir):
channels_count = Counter()
height_count = Counter()
width_count = Counter()
height_width_count = Counter()
for i, row in df.iterrows():
path=f"/storage/aptos2019-blindness-detection/{img_sub_dir}/{row['id_code']}.png"
height, width, channels = imageio.imread(path).shape
channels_count.update([channels])
height_count.update([height])
width_count.update([width])
height_width_count.update([f'h{height}_w{width}'])
print(channels_count)
print(height_count)
print(width_count)
print(height_width_count)
return channels_count, height_count, width_count, height_width_count
df_train.hist()
train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.05,
stratify=y, random_state=SEED)
print(train_x.shape, train_y.shape, valid_x.shape, valid_y.shape)
train_y.hist()
valid_y.hist()
How do we know that a patient have diabetic retinopahy? There are at least 5 things to spot on. Image credit https://www.eyeops.com/

From quick investigations of the data (see various pictures below), I found that Hemorrphages, Hard Exudates and Cotton Wool spots are quite easily observed. However, I still could not find examples of Aneurysm or Abnormal Growth of Blood Vessels from our data yet. Perhaps the latter two cases are important if we want to catch up human benchmnark using our model.
First, let have a glance of original inputs. Each row depicts each severity level. We can see two problems which make the severity difficult to spot on. First, some images are very dark [pic(0,2) and pic(4,4) ] and sometimes different color illumination is confusing [pic (3,3)]. Second, we can get the uninformative dark areas for some pictures [pic(0,1), pic(0,3)]. This is important when we reduce the picture size, as informative areas become too small. So it is intuitive to crop the uninformative areas out in the second case.
%%time
fig = plt.figure(figsize=(25, 16))
# display 10 images from each class
for class_id in sorted(train_y.unique()):
for i, (idx, row) in enumerate(df_train.loc[df_train['diagnosis'] == class_id].sample(5, random_state=SEED).iterrows()):
ax = fig.add_subplot(5, 5, class_id * 5 + i + 1, xticks=[], yticks=[])
path=f"/storage/aptos2019-blindness-detection/train_images/{row['id_code']}.png"
image = cv2.imread(path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
plt.imshow(image)
ax.set_title('Label: %d-%d-%s' % (class_id, idx, row['id_code']) )
We can try gray scale and feel understand better for some pictures, as color distraction is gone. For example, we can see more blood clearer in the upper part of pic(4,4), which has severity of level 4.
%%time
fig = plt.figure(figsize=(25, 16))
for class_id in sorted(train_y.unique()):
for i, (idx, row) in enumerate(df_train.loc[df_train['diagnosis'] == class_id].sample(5, random_state=SEED).iterrows()):
ax = fig.add_subplot(5, 5, class_id * 5 + i + 1, xticks=[], yticks=[])
path=f"/storage/aptos2019-blindness-detection/train_images/{row['id_code']}.png"
image = cv2.imread(path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# image=cv2.addWeighted ( image, 0 , cv2.GaussianBlur( image , (0 ,0 ) , 10) ,-4 ,128)
image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
plt.imshow(image, cmap='gray')
ax.set_title('Label: %d-%d-%s' % (class_id, idx, row['id_code']) )
For severity level 4, I feel that two examples here are difficult to spot on, pic(4,1) and pic(4,4). As we try zooming to see the details (use real size image), we can see some abnormalities (cotton wool spots or hard exudates ?) in those eyes clearer (observe the lower-right part of the eye). Therefore, IMG_SIZE is definitely important for this problem. In the next section, we shall see better method than gray-scale conversion.
dpi = 80 #inch
# path=f"../input/aptos2019-blindness-detection/train_images/5c7ab966a3ee.png" # notice upper part
path=f"/storage/aptos2019-blindness-detection/train_images/cd54d022e37d.png" # lower-right, this still looks not so severe, can be class3
image = cv2.imread(path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
height, width = image.shape
print(height, width)
SCALE=2
figsize = (width / float(dpi))/SCALE, (height / float(dpi))/SCALE
fig = plt.figure(figsize=figsize)
plt.imshow(image, cmap='gray')
In the last competition, Ben Graham (last competition's winner) share insightful way to improve lighting condition. Here, we apply his idea, and can see many important details in the eyes much better. For full details, please refer to his technical report in the link above.
%%time
fig = plt.figure(figsize=(25, 16))
for class_id in sorted(train_y.unique()):
for i, (idx, row) in enumerate(df_train.loc[df_train['diagnosis'] == class_id].sample(5, random_state=SEED).iterrows()):
ax = fig.add_subplot(5, 5, class_id * 5 + i + 1, xticks=[], yticks=[])
path=f"/storage/aptos2019-blindness-detection/train_images/{row['id_code']}.png"
image = cv2.imread(path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
image=cv2.addWeighted ( image,4, cv2.GaussianBlur( image , (0,0) , IMG_SIZE/10) ,-4 ,128) # the trick is to add this line
plt.imshow(image, cmap='gray')
ax.set_title('Label: %d-%d-%s' % (class_id, idx, row['id_code']) )
To crop out the uninformative black areas which are evident on pic(0,1), pic(0,3) and pic(4,1), we can try auto cropping. I found 4 alternative codes from https://stackoverflow.com/questions/13538748/crop-black-edges-with-opencv and https://codereview.stackexchange.com/questions/132914/crop-black-border-of-image-using-numpy/132934 ... Fortunately one method works perfectly for a gray scale image, but none works on a color image. In this kernel, I modify the method working on gray-scale a bit to make it suitable for a color image.
def crop_image1(img,tol=7):
# img is image data
# tol is tolerance
mask = img>tol
return img[np.ix_(mask.any(1),mask.any(0))]
def crop_image_from_gray(img,tol=7):
if img.ndim ==2:
mask = img>tol
return img[np.ix_(mask.any(1),mask.any(0))]
elif img.ndim==3:
gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
mask = gray_img>tol
check_shape = img[:,:,0][np.ix_(mask.any(1),mask.any(0))].shape[0]
if (check_shape == 0): # image is too dark so that we crop out everything,
return img # return original image
else:
img1=img[:,:,0][np.ix_(mask.any(1),mask.any(0))]
img2=img[:,:,1][np.ix_(mask.any(1),mask.any(0))]
img3=img[:,:,2][np.ix_(mask.any(1),mask.any(0))]
# print(img1.shape,img2.shape,img3.shape)
img = np.stack([img1,img2,img3],axis=-1)
# print(img.shape)
return img
I have tested on around 200 images, and the method works great. However, if anybody find the outlier cases which cause the auto crop to fail, please let me know. I think now the eye pictures are very like the moon by the way :)
IMPORTANT UPDATE on Kernel V.9 I found that there is indeed a case in private test set making the old version of crop function fail. (I spent my 13 submissions until I found this bug) E.g. if there is an adversarial image (super dark) in the private test set, the crop function will crop everything and result in 0 dimension image. I have fixed this bug in this kernel version, but I still could not guarantee whether there are other cases in a private test that will make the crop function fail or not. Update on V11 Now I was able to have a valid LB score with the new crop function, so if anybody still have some submission errors, that is the reason of other bugs.
At first, when I wrote this kernel, I could not make a color crop nicely, so I thought that gray scale is a better representation. Now I believe that color version is better, so from this point on I will use color cropping
Below is the cropped of the color version. For color version, note that I use argument sigmaX = 30 of cv2.GaussianBlur, where Ben actually used sigmaX = 10 which may have better performance. I just feel that this sigmaX = 30 or sigmaX = 50 make beautiful [sometimes bloody] yellow moon pictures. Just for the purpose of illustration.
Please refer to https://www.tutorialkart.com/opencv/python/opencv-python-gaussian-image-smoothing/ .
def load_ben_color(path, sigmaX=10):
image = cv2.imread(path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = crop_image_from_gray(image)
image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
image=cv2.addWeighted ( image,4, cv2.GaussianBlur( image , (0,0) , sigmaX) ,-4 ,128)
return image
%%time
NUM_SAMP=7
fig = plt.figure(figsize=(25, 16))
for class_id in sorted(train_y.unique()):
for i, (idx, row) in enumerate(df_train.loc[df_train['diagnosis'] == class_id].sample(NUM_SAMP, random_state=SEED).iterrows()):
ax = fig.add_subplot(5, NUM_SAMP, class_id * NUM_SAMP + i + 1, xticks=[], yticks=[])
path=f"/storage/aptos2019-blindness-detection/train_images/{row['id_code']}.png"
image = load_ben_color(path,sigmaX=30)
plt.imshow(image)
ax.set_title('%d-%d-%s' % (class_id, idx, row['id_code']) )
@taindow proposes an interesting idea of making a circle crop to the image, so I update the kernel to let you compare the results. Credit : https://www.kaggle.com/taindow/pre-processing-train-and-test-images ... Observe that we now get a magic circle, but by using circle crop, some scabs/wools may get loss.
def circle_crop(img, sigmaX=10):
"""
Create circular crop around image centre
"""
img = cv2.imread(img)
img = crop_image_from_gray(img)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
height, width, depth = img.shape
x = int(width/2)
y = int(height/2)
r = np.amin((x,y))
circle_img = np.zeros((height, width), np.uint8)
cv2.circle(circle_img, (x,y), int(r), 1, thickness=-1)
img = cv2.bitwise_and(img, img, mask=circle_img)
img = crop_image_from_gray(img)
img=cv2.addWeighted ( img,4, cv2.GaussianBlur( img , (0,0) , sigmaX) ,-4 ,128)
return img
%%time
## try circle crop
NUM_SAMP=1
for class_id in [1, 2]:
for i, (idx, row) in enumerate(df_train.loc[df_train['diagnosis'] == class_id].sample(NUM_SAMP, random_state=SEED).iterrows()):
ax = fig.add_subplot(5, NUM_SAMP, class_id * NUM_SAMP + i + 1, xticks=[], yticks=[])
path=f"/storage/aptos2019-blindness-detection/train_images/{row['id_code']}.png"
image = circle_crop(path,sigmaX=30)
image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
plt.imshow(image)
We can try plotting a picture (sample train pic(4,1) above) with IMG_SIZE with cropping, now important information is much clearer to see with sigmaX = 10
dpi = 80 #inch
# path=f"../input/aptos2019-blindness-detection/train_images/5c7ab966a3ee.png" # notice upper part
path=f"/storage/aptos2019-blindness-detection/train_images/cd54d022e37d.png" # lower-right, can be class3
image = load_ben_color(path,sigmaX=10)
height, width = IMG_SIZE, IMG_SIZE
print(height, width)
SCALE=1
figsize = (width / float(dpi))/SCALE, (height / float(dpi))/SCALE
fig = plt.figure(figsize=figsize)
plt.imshow(image, cmap='gray')
We can also try auto cropping on 50 test data to see that it work fine. Below, we see immediately from this random samples that severed cases, with level >2, are relatively many more compared to the training set.
%%time
NUM_SAMP=10
fig = plt.figure(figsize=(25, 16))
for jj in range(5):
for i, (idx, row) in enumerate(df_test.sample(NUM_SAMP,random_state=SEED+jj).iterrows()):
ax = fig.add_subplot(5, NUM_SAMP, jj * NUM_SAMP + i + 1, xticks=[], yticks=[])
path=f"/storage/aptos2019-blindness-detection/test_images/{row['id_code']}.png"
image = load_ben_color(path,sigmaX=30)
plt.imshow(image)
ax.set_title('%d-%s' % (idx, row['id_code']) )
%%time
'''Bonus : sigmaX=50'''
NUM_SAMP=10
fig = plt.figure(figsize=(25, 16))
for jj in range(5):
for i, (idx, row) in enumerate(df_test.sample(NUM_SAMP,random_state=SEED+jj).iterrows()):
ax = fig.add_subplot(5, NUM_SAMP, jj * NUM_SAMP + i + 1, xticks=[], yticks=[])
path=f"/storage/aptos2019-blindness-detection/test_images/{row['id_code']}.png"
image = load_ben_color(path,sigmaX=50)
plt.imshow(image, cmap='gray')
ax.set_title('%d-%s' % (idx, row['id_code']) )
Thanks @tanlikesmath, https://www.kaggle.com/tanlikesmath/diabetic-retinopathy-resized who provides us a complete previous-competition dataset in the .jpeg format; this is much smaller than the original version with the risk of losing image details. Let apply both normal gray scale, and Ben Graham's method to this dataset.
!ls /storage/diabetic-retinopathy-resized/
!ls /storage/diabetic-retinopathy-resized/resized_train/resized_train | head
df_old = pd.read_csv('/storage/retinopathy-train-2015/trainLabels.csv')
df_old.head()
NUM_SAMP=10
fig = plt.figure(figsize=(25, 16))
for class_id in sorted(train_y.unique()):
for i, (idx, row) in enumerate(df_old.loc[df_old['level'] == class_id].sample(NUM_SAMP, random_state=SEED).iterrows()):
ax = fig.add_subplot(5, NUM_SAMP, class_id * NUM_SAMP + i + 1, xticks=[], yticks=[])
path=f"/storage/retinopathy-train-2015/rescaled_train_896/{row['image']}.png"
image = load_ben_color(path,sigmaX=10)
plt.imshow(image)
ax.set_title('%d-%d-%s' % (class_id, idx, row['image']) )
Below is the unpreprocess version, just for comparison
NUM_SAMP=10
fig = plt.figure(figsize=(25, 16))
for class_id in sorted(train_y.unique()):
for i, (idx, row) in enumerate(df_old.loc[df_old['level'] == class_id].sample(NUM_SAMP, random_state=SEED).iterrows()):
ax = fig.add_subplot(5, NUM_SAMP, class_id * NUM_SAMP + i + 1, xticks=[], yticks=[])
print(row['image'])
path=f"/storage/diabetic-retinopathy-resized/resized_train/resized_train/{row['image']}.jpeg"
image = cv2.imread(path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# image = crop_image_from_gray(image)
image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
# image=cv2.addWeighted ( image,4, cv2.GaussianBlur( image , (0,0) , IMG_SIZE/10) ,-4 ,128)
plt.imshow(image, cmap='gray')
ax.set_title('%d-%d-%s' % (class_id, idx, row['image']) )
Ok preprocessing methods seem to works fine; however, the doctors to estimate the severity levels in the past competitions may have different criteria in mind than the doctors of Aravind, so it is possible to have some estimation inconsistency (at least to my eyes the previous data seems more noisy). The following level-4 [pic(4,1) in the plot we just made above] looks not so severe. (Or this is the example case of too many blood vessels ??, refer to Section 1.1)
dpi = 80 #inch
path=f"/storage/diabetic-retinopathy-resized/resized_train/resized_train/31590_right.jpeg" # too many vessels?
# path=f"../input/diabetic-retinopathy-resized/resized_train/resized_train/18017_left.jpeg" # details are lost
image = load_ben_color(path,sigmaX=30)
# image = cv2.imread(path)
# image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# image = crop_image1(image)
# image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
# image=cv2.addWeighted ( image,4, cv2.GaussianBlur( image , (0,0) , IMG_SIZE/10) ,-4 ,128)
height, width = IMG_SIZE, IMG_SIZE
print(height, width)
SCALE=1
figsize = (width / float(dpi))/SCALE, (height / float(dpi))/SCALE
fig = plt.figure(figsize=figsize)
plt.imshow(image, cmap='gray')
df_train2 = pd.read_csv('/storage/aptos2019-blindness-detection/train.csv')
df_train2.head()
df_train2.hist()
df_train2.groupby('diagnosis').agg(['count'])
df_2015_1 = pd.read_csv('/storage/diabetic-retinopathy-resized/trainLabels.csv')
df_2015_1.head()
df_2015_1.groupby('level').agg(['count'])
df_2015_2 = pd.read_csv('/storage/diabetic-retinopathy-resized/trainLabels_cropped.csv')
df_2015_2.head()
df_2015_2.groupby('level').agg(['count'])
!ls /storage/retinopathy-train-2015
df_2015_3 = pd.read_csv('/storage/retinopathy-train-2015/trainLabels.csv')
df_2015_3.head()
df_2015_3.groupby('level').agg(['count'])
We will merge the original dataset with the retinopathy-train-2015 dataset for our train dataset.
To do this, we'll need to proceed with the following steps:
root_path = '/storage'
df_train1 = pd.read_csv(f'{root_path}/aptos2019-blindness-detection/train.csv')
df_test = pd.read_csv(f'{root_path}/aptos2019-blindness-detection/test.csv')
df_train2 = pd.read_csv(f'{root_path}/retinopathy-train-2015/trainLabels.csv')
df_train1.head(1)
df_train2.head(1)
df_train2 = df_train2.rename(columns={'image': 'id_code', 'level': 'diagnosis'})
df_train2.head(1)
df_train_all = pd.concat([df_train1, df_train2])
df_train_all
!mkdir -p /storage/aptosplus
!mkdir -p /storage/aptosplus/aptos
!mkdir -p /storage/aptosplus/eyepacs
eyepacs = re.compile(r'(left|right)')
def set_ds(c):
id_code = c['id_code']
return 'eyepacs' if eyepacs.search(id_code) else 'aptos'
df_train_all['ds'] = df_train_all.apply(set_ds, axis=1)
df_train_all.head()
df_train_all.to_csv('/storage/aptosplus/trainLabels.csv', index=False)
!head -10 /storage/aptosplus/trainLabels.csv
def write_images_to_disk_multi(packed_args):
i, row = packed_args
if row['ds'] == 'aptos':
image = load_ben_color(f'{root_path}/aptos2019-blindness-detection/train_images/{row["id_code"]}.png')
cv2.imwrite(f'{root_path}/aptosplus/aptos/{row["id_code"]}.png', image)
else:
image = load_ben_color(f'{root_path}/retinopathy-train-2015/rescaled_train_896/{row["id_code"]}.png')
cv2.imwrite(f'{root_path}/aptosplus/eyepacs/{row["id_code"]}.png', image)
%%time
pool = Pool()
pool.map(write_images_to_disk_multi, df_train_all.iterrows())